# Exploratory Data Analysis
!pip install ydata_profiling
from pandas_profiling import ProfileReport
# Data Processing
import pandas as pd
import numpy as np
import re
from sklearn.impute import SimpleImputer
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Creating the modeling dataset
from sklearn.datasets import make_classification
# Model and performance
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Over sampling and under sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from collections import Counter
# Treating data imbalance
!pip install -U imbalanced-learn
Requirement already satisfied: ydata_profiling in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (4.0.0) Requirement already satisfied: statsmodels<0.14,>=0.13.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (0.13.2) Requirement already satisfied: tqdm<4.65,>=4.48.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (4.64.1) Requirement already satisfied: pandas!=1.4.0,<1.6,>1.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (1.4.4) Requirement already satisfied: requests<2.29,>=2.24.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (2.28.1) Requirement already satisfied: matplotlib<3.7,>=3.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (3.5.2) Requirement already satisfied: jinja2<3.2,>=2.11.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (2.11.3) Requirement already satisfied: htmlmin==0.1.12 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (0.1.12) Requirement already satisfied: scipy<1.10,>=1.4.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (1.9.1) Requirement already satisfied: typeguard<2.14,>=2.13.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (2.13.3) Requirement already satisfied: multimethod<1.10,>=1.4 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (1.9.1) Requirement already satisfied: pydantic<1.11,>=1.8.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (1.10.5) Requirement already satisfied: seaborn<0.13,>=0.10.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (0.11.2) Requirement already satisfied: 
phik<0.13,>=0.11.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (0.12.3) Requirement already satisfied: PyYAML<6.1,>=5.0.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (6.0) Requirement already satisfied: visions[type_image_path]==0.7.5 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (0.7.5) Requirement already satisfied: numpy<1.24,>=1.16.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from ydata_profiling) (1.21.5) Requirement already satisfied: attrs>=19.3.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (21.4.0) Requirement already satisfied: tangled-up-in-unicode>=0.0.4 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (0.2.0) Requirement already satisfied: networkx>=2.4 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (2.8.4) Requirement already satisfied: imagehash in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (4.3.1) Requirement already satisfied: Pillow in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata_profiling) (9.2.0) Requirement already satisfied: MarkupSafe>=0.23 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from jinja2<3.2,>=2.11.1->ydata_profiling) (2.0.1) Requirement already satisfied: python-dateutil>=2.7 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (2.8.2) Requirement already satisfied: fonttools>=4.22.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (4.25.0) 
Requirement already satisfied: pyparsing>=2.2.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (3.0.9) Requirement already satisfied: cycler>=0.10 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (0.11.0) Requirement already satisfied: packaging>=20.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (21.3) Requirement already satisfied: kiwisolver>=1.0.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<3.7,>=3.2->ydata_profiling) (1.4.2) Requirement already satisfied: pytz>=2020.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from pandas!=1.4.0,<1.6,>1.1->ydata_profiling) (2022.1) Requirement already satisfied: joblib>=0.14.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from phik<0.13,>=0.11.1->ydata_profiling) (1.2.0) Requirement already satisfied: typing-extensions>=4.2.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from pydantic<1.11,>=1.8.1->ydata_profiling) (4.3.0) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from requests<2.29,>=2.24.0->ydata_profiling) (1.26.11) Requirement already satisfied: idna<4,>=2.5 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from requests<2.29,>=2.24.0->ydata_profiling) (3.3) Requirement already satisfied: charset-normalizer<3,>=2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from requests<2.29,>=2.24.0->ydata_profiling) (2.0.4) Requirement already satisfied: certifi>=2017.4.17 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from requests<2.29,>=2.24.0->ydata_profiling) (2022.9.24) Requirement already satisfied: patsy>=0.5.2 in 
/Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from statsmodels<0.14,>=0.13.2->ydata_profiling) (0.5.2) Requirement already satisfied: six in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from patsy>=0.5.2->statsmodels<0.14,>=0.13.2->ydata_profiling) (1.16.0) Requirement already satisfied: PyWavelets in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from imagehash->visions[type_image_path]==0.7.5->ydata_profiling) (1.3.0) Requirement already satisfied: imbalanced-learn in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (0.10.1) Requirement already satisfied: numpy>=1.17.3 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from imbalanced-learn) (1.21.5) Requirement already satisfied: scikit-learn>=1.0.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from imbalanced-learn) (1.0.2) Requirement already satisfied: joblib>=1.1.1 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from imbalanced-learn) (1.2.0) Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from imbalanced-learn) (2.2.0) Requirement already satisfied: scipy>=1.3.2 in /Users/farhanachowdhury/opt/anaconda3/lib/python3.9/site-packages (from imbalanced-learn) (1.9.1)
# Load the UCI Adult train/test partitions. The raw files ship without a
# header row and encode missing values as ' ?' (note the leading space),
# so we assign column names at read time and map ' ?' to NaN.
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                'occupation', 'relationship', 'race', 'gender', 'capital-gain', 'capital-loss',
                'hours-per-week', 'country', 'income']
df1 = pd.read_csv('adult_data.csv', header=None, na_values=' ?', names=column_names)
df2 = pd.read_csv('adult_test.csv', header=None, na_values=' ?', names=column_names)
# Stack the two partitions into a single frame with a fresh 0..n-1 index
df = pd.concat([df1, df2], ignore_index=True)
df.head()
| age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | gender | capital-gain | capital-loss | hours-per-week | country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
| 2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
# Build an interactive profiling report for the raw dataframe; evaluating
# `EDA` as the last expression renders the report inline in the notebook.
EDA = ProfileReport(df, title = "EDA of the Adult Dataset", html={'style':{'full_width': True}})
EDA
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
# Transposed summary statistics for the numeric columns only
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| age | 48842.0 | 38.643585 | 13.710510 | 17.0 | 28.0 | 37.0 | 48.0 | 90.0 |
| fnlwgt | 48842.0 | 189664.134597 | 105604.025423 | 12285.0 | 117550.5 | 178144.5 | 237642.0 | 1490400.0 |
| education-num | 48842.0 | 10.078089 | 2.570973 | 1.0 | 9.0 | 10.0 | 12.0 | 16.0 |
| capital-gain | 48842.0 | 1079.067626 | 7452.019058 | 0.0 | 0.0 | 0.0 | 0.0 | 99999.0 |
| capital-loss | 48842.0 | 87.502314 | 403.004552 | 0.0 | 0.0 | 0.0 | 0.0 | 4356.0 |
| hours-per-week | 48842.0 | 40.422382 | 12.391444 | 1.0 | 40.0 | 40.0 | 45.0 | 99.0 |
# The target has 2 classes: '<=50K' and '>50K'.
# However, the test file writes them with a trailing period ('<=50K.' and
# '>50K.'), creating 2 spurious additional classes.
# So, we normalize the 'income' strings by stripping the trailing period
# (regex=False: plain substring replacement, '.' is not a wildcard here).
df_modified = df.copy()
df_modified['income'] = df_modified['income'].str.replace('<=50K.', '<=50K',regex=False)
df_modified['income'] = df_modified['income'].str.replace('>50K.', '>50K', regex=False)
# NOTE(review): the values keep a leading space (' <=50K', ' >50K') — consider
# a .str.strip() pass if exact labels matter downstream.
df_modified['income'].unique()
array([' <=50K', ' >50K'], dtype=object)
# Distribution of the raw education levels before consolidation
df_modified.education.value_counts()
HS-grad 15784 Some-college 10878 Bachelors 8025 Masters 2657 Assoc-voc 2061 11th 1812 Assoc-acdm 1601 10th 1389 7th-8th 955 Prof-school 834 9th 756 12th 657 Doctorate 594 5th-6th 509 1st-4th 247 Preschool 83 Name: education, dtype: int64
# Consolidate the fine-grained education levels into broader buckets.
# The original cell issued 13 separate replace calls, several of which were
# no-ops ('Bachelors' -> 'Bachelors', 'HS-grad' -> 'HS-grad',
# 'Doctorate' -> 'Doctorate'); a single mapping is equivalent and easier to
# audit. regex=True makes Series.replace match substrings, which is required
# because every raw value carries a leading space (e.g. ' 11th').
education_map = {
    '11th': 'High-school',
    '9th': 'High-school',
    '7th-8th': 'Junior-High',
    '5th-6th': 'Primary',
    '10th': 'High-school',
    '1st-4th': 'Primary',
    'Preschool': 'Pre-school',
    '12th': 'High-school',
    'Prof-school': 'Masters',
    'Some-college': 'College',
    'Assoc-acdm': 'College',
    'Assoc-voc': 'College',
}
df_modified['education'] = df_modified['education'].replace(education_map, regex=True)
df_modified['education'].unique()
array([' Bachelors', ' HS-grad', ' High-school', ' Masters', ' College',
' Junior-High', ' Doctorate', ' Primary', ' Pre-school'],
dtype=object)
# Remove exact duplicate rows (48842 -> 48790 rows)
df_modified = df_modified.drop_duplicates()
df_modified.shape
(48790, 15)
# Count of missing values per column
df_modified.isna().sum()
age 0 workclass 2795 fnlwgt 0 education 0 education-num 0 marital-status 0 occupation 2805 relationship 0 race 0 gender 0 capital-gain 0 capital-loss 0 hours-per-week 0 country 856 income 0 dtype: int64
# Percentage of missing values per column
df_modified.isna().mean()*100
age 0.000000 workclass 5.728633 fnlwgt 0.000000 education 0.000000 education-num 0.000000 marital-status 0.000000 occupation 5.749129 relationship 0.000000 race 0.000000 gender 0.000000 capital-gain 0.000000 capital-loss 0.000000 hours-per-week 0.000000 country 1.754458 income 0.000000 dtype: float64
The missing values are confined to 3 categorical variables: workclass (2795), occupation (2805), and country (856).
All of these 3 variables are categorical and non-ordinal which is why we cannot convert them into ordinal numeric values to apply multivariate imputer like KNN. We are trying 3 options to impute the missing values:
# Option 1: listwise deletion — drop every row containing at least one NaN.
# Work on a copy so df_modified itself is left untouched.
x = df_modified.copy()
df_dropNA = x.dropna(axis=0, how='any')
# Confirm no missing values remain
df_dropNA.isna().sum()
age 0 workclass 0 fnlwgt 0 education 0 education-num 0 marital-status 0 occupation 0 relationship 0 race 0 gender 0 capital-gain 0 capital-loss 0 hours-per-week 0 country 0 income 0 dtype: int64
# Structure check: 45175 rows survive listwise deletion
df_dropNA.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 45175 entries, 0 to 48841 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 45175 non-null int64 1 workclass 45175 non-null object 2 fnlwgt 45175 non-null int64 3 education 45175 non-null object 4 education-num 45175 non-null int64 5 marital-status 45175 non-null object 6 occupation 45175 non-null object 7 relationship 45175 non-null object 8 race 45175 non-null object 9 gender 45175 non-null object 10 capital-gain 45175 non-null int64 11 capital-loss 45175 non-null int64 12 hours-per-week 45175 non-null int64 13 country 45175 non-null object 14 income 45175 non-null object dtypes: int64(6), object(9) memory usage: 5.5+ MB
# Option 2: keep the rows and flag missingness with an explicit category
df_keepNA = df_modified.copy()
df_keepNA.info(show_counts = True)
<class 'pandas.core.frame.DataFrame'> Int64Index: 48790 entries, 0 to 48841 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 48790 non-null int64 1 workclass 45995 non-null object 2 fnlwgt 48790 non-null int64 3 education 48790 non-null object 4 education-num 48790 non-null int64 5 marital-status 48790 non-null object 6 occupation 45985 non-null object 7 relationship 48790 non-null object 8 race 48790 non-null object 9 gender 48790 non-null object 10 capital-gain 48790 non-null int64 11 capital-loss 48790 non-null int64 12 hours-per-week 48790 non-null int64 13 country 47934 non-null object 14 income 48790 non-null object dtypes: int64(6), object(9) memory usage: 6.0+ MB
# Replace NaN in the three affected categorical columns with an explicit
# 'not available' category.
# FIX: calling .fillna(..., inplace=True) on a column selection is chained
# assignment — it may silently operate on a copy, and pandas 2.x deprecates
# it with a FutureWarning. Filling through a column->value dict on the frame
# is the supported, equivalent form.
df_keepNA = df_keepNA.fillna({
    'occupation': 'not available',
    'workclass': 'not available',
    'country': 'not available',
})
df_keepNA.info(show_counts = True)
<class 'pandas.core.frame.DataFrame'> Int64Index: 48790 entries, 0 to 48841 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 48790 non-null int64 1 workclass 48790 non-null object 2 fnlwgt 48790 non-null int64 3 education 48790 non-null object 4 education-num 48790 non-null int64 5 marital-status 48790 non-null object 6 occupation 48790 non-null object 7 relationship 48790 non-null object 8 race 48790 non-null object 9 gender 48790 non-null object 10 capital-gain 48790 non-null int64 11 capital-loss 48790 non-null int64 12 hours-per-week 48790 non-null int64 13 country 48790 non-null object 14 income 48790 non-null object dtypes: int64(6), object(9) memory usage: 6.0+ MB
Before replacing 'na' values with the most frequent value (mode), we need to split the data into train and test sets, as we cannot leak any information from the test set into the imputation.
# Option 3: mode imputation. Split first so the imputation statistics are
# learned from the training partition only (no leakage into the test set).
df_ModeImpute = df_modified.copy()
ind_var = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
           'marital-status', 'occupation', 'relationship', 'race', 'gender',
           'capital-gain', 'capital-loss', 'hours-per-week', 'country'] # independent variables
X = df_ModeImpute[ind_var] # Features
y = df_ModeImpute['income'] # Target variable
X_train_ModeImpute, X_test_ModeImpute, y_train_ModeImpute, y_test_ModeImpute = train_test_split(X, y, test_size=0.3, random_state=40)
# Replace NaN with the training-set mode, in the training set only.
# FIX: assigning into columns of a train_test_split result is chained
# assignment and can raise SettingWithCopyWarning; computing the modes once
# and filling via a column->value dict avoids it and is equivalent.
mode_fill = {col: X_train_ModeImpute[col].mode()[0]
             for col in ['occupation', 'workclass', 'country']}
X_train_ModeImpute = X_train_ModeImpute.fillna(mode_fill)
X_train_ModeImpute.head()
| age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | gender | capital-gain | capital-loss | hours-per-week | country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7174 | 28 | Private | 107411 | Bachelors | 13 | Never-married | Prof-specialty | Not-in-family | White | Female | 0 | 0 | 35 | United-States |
| 26712 | 70 | Private | 89787 | HS-grad | 9 | Married-civ-spouse | Prof-specialty | Husband | White | Male | 0 | 0 | 20 | United-States |
| 41840 | 80 | Self-emp-inc | 164909 | Masters | 14 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 99999 | 0 | 54 | United-States |
| 29718 | 33 | Self-emp-not-inc | 170979 | Masters | 15 | Married-civ-spouse | Prof-specialty | Husband | White | Male | 0 | 1887 | 40 | United-States |
| 3565 | 34 | Private | 157165 | College | 10 | Divorced | Other-service | Unmarried | White | Female | 0 | 0 | 40 | United-States |
# For the ModeImpute variant, the test set keeps its real missing values, so
# we drop incomplete test rows instead of imputing them.
X_test_ModeImpute = X_test_ModeImpute.dropna()
# BUG FIX: the original dropped rows from X only, leaving y_test_ModeImpute
# with more rows than X_test_ModeImpute. Realign the labels to the surviving
# feature rows so any later evaluation uses matching (X, y) pairs.
y_test_ModeImpute = y_test_ModeImpute.loc[X_test_ModeImpute.index]
X_test_ModeImpute.isna().sum()
age 0 workclass 0 fnlwgt 0 education 0 education-num 0 marital-status 0 occupation 0 relationship 0 race 0 gender 0 capital-gain 0 capital-loss 0 hours-per-week 0 country 0 dtype: int64
# Compare the profile of the listwise-deleted data against the baseline report
EDA_modified1 = ProfileReport(df_dropNA, title="EDA comparison base df & df_dropNA")
comparison_report = EDA.compare(EDA_modified1)
# BUG FIX: both comparison cells originally wrote to the same
# 'original_vs_transformed.html', so the second export silently overwrote the
# first; use a variant-specific filename.
comparison_report.to_file("original_vs_dropNA.html")
comparison_report
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
# Compare the profile of the placeholder-imputed data against the baseline report
EDA_modified2 = ProfileReport(df_keepNA, title="EDA comparison base df & df_keepNA")
comparison_report = EDA.compare(EDA_modified2)
# BUG FIX: both comparison cells originally wrote to the same
# 'original_vs_transformed.html', overwriting each other; use a
# variant-specific filename.
comparison_report.to_file("original_vs_keepNA.html")
comparison_report
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
The target variable of this dataset is imbalanced, so we will need to treat the class imbalance before building the model.
# Class balance after listwise deletion
df_dropNA['income'].value_counts(normalize = True)
<=50K 0.752031 >50K 0.247969 Name: income, dtype: float64
# Class balance with the 'not available' placeholder rows retained
df_keepNA['income'].value_counts(normalize = True)
<=50K 0.760586 >50K 0.239414 Name: income, dtype: float64
# Class balance of the mode-imputed training labels
y_train_ModeImpute.value_counts(normalize = True)
<=50K 0.761134 >50K 0.238866 Name: income, dtype: float64
First we need to encode our categorical string data: machine learning models cannot work directly on categorical string values and would raise errors otherwise. We therefore convert the categorical string features to numerical values, either with OneHotEncoding or with the Pandas get_dummies function.
# One-hot encode the categorical columns of df_dropNA. The numeric columns
# are set aside first so they can be merged back alongside the dummies.
cat_cols = ['workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'gender', 'country']
df_dropNA_num = df_dropNA.drop(cat_cols, axis=1)
df_dropNA_encoded = pd.get_dummies(df_dropNA[cat_cols])
df_dropNA_encoded = pd.concat([df_dropNA_encoded, df_dropNA_num], axis=1)
df_dropNA_encoded
| workclass_ Federal-gov | workclass_ Local-gov | workclass_ Private | workclass_ Self-emp-inc | workclass_ Self-emp-not-inc | workclass_ State-gov | workclass_ Without-pay | education_ Bachelors | education_ College | education_ Doctorate | ... | country_ United-States | country_ Vietnam | country_ Yugoslavia | age | fnlwgt | education-num | capital-gain | capital-loss | hours-per-week | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 39 | 77516 | 13 | 2174 | 0 | 40 | <=50K |
| 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 50 | 83311 | 13 | 0 | 0 | 13 | <=50K |
| 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 38 | 215646 | 9 | 0 | 0 | 40 | <=50K |
| 3 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 53 | 234721 | 7 | 0 | 0 | 40 | <=50K |
| 4 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 28 | 338409 | 13 | 0 | 0 | 40 | <=50K |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 48836 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 33 | 245211 | 13 | 0 | 0 | 40 | <=50K |
| 48837 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 39 | 215419 | 13 | 0 | 0 | 36 | <=50K |
| 48839 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 38 | 374983 | 13 | 0 | 0 | 50 | <=50K |
| 48840 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 44 | 83891 | 13 | 5455 | 0 | 40 | <=50K |
| 48841 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 1 | 0 | 0 | 35 | 182148 | 13 | 0 | 0 | 60 | >50K |
45175 rows × 98 columns
# Same encoding scheme for the keep-NA variant; the 'not available'
# placeholder becomes its own dummy column per affected categorical feature.
cat_cols = ['workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'gender', 'country']
df_keepNA_num = df_keepNA.drop(cat_cols, axis=1)
df_keepNA_encoded = pd.get_dummies(df_keepNA[cat_cols])
df_keepNA_encoded = pd.concat([df_keepNA_encoded, df_keepNA_num], axis=1)
df_keepNA_encoded
| workclass_ Federal-gov | workclass_ Local-gov | workclass_ Never-worked | workclass_ Private | workclass_ Self-emp-inc | workclass_ Self-emp-not-inc | workclass_ State-gov | workclass_ Without-pay | workclass_not available | education_ Bachelors | ... | country_ Vietnam | country_ Yugoslavia | country_not available | age | fnlwgt | education-num | capital-gain | capital-loss | hours-per-week | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 39 | 77516 | 13 | 2174 | 0 | 40 | <=50K |
| 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 50 | 83311 | 13 | 0 | 0 | 13 | <=50K |
| 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 38 | 215646 | 9 | 0 | 0 | 40 | <=50K |
| 3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 53 | 234721 | 7 | 0 | 0 | 40 | <=50K |
| 4 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 28 | 338409 | 13 | 0 | 0 | 40 | <=50K |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 48837 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 39 | 215419 | 13 | 0 | 0 | 36 | <=50K |
| 48838 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 64 | 321403 | 9 | 0 | 0 | 40 | <=50K |
| 48839 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 38 | 374983 | 13 | 0 | 0 | 50 | <=50K |
| 48840 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 44 | 83891 | 13 | 5455 | 0 | 40 | <=50K |
| 48841 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 35 | 182148 | 13 | 0 | 0 | 60 | >50K |
48790 rows × 102 columns
# Encode the ModeImpute training features; this variant was already split,
# so train and test are encoded in separate cells.
cat_cols = ['workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'gender', 'country']
X_train_ModeImpute_num = X_train_ModeImpute.drop(cat_cols, axis=1)
X_train_ModeImpute_encoded = pd.get_dummies(X_train_ModeImpute[cat_cols])
X_train_ModeImpute_encoded = pd.concat([X_train_ModeImpute_encoded, X_train_ModeImpute_num], axis=1)
X_train_ModeImpute_encoded
| workclass_ Federal-gov | workclass_ Local-gov | workclass_ Never-worked | workclass_ Private | workclass_ Self-emp-inc | workclass_ Self-emp-not-inc | workclass_ State-gov | workclass_ Without-pay | education_ Bachelors | education_ College | ... | country_ Trinadad&Tobago | country_ United-States | country_ Vietnam | country_ Yugoslavia | age | fnlwgt | education-num | capital-gain | capital-loss | hours-per-week | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7174 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 28 | 107411 | 13 | 0 | 0 | 35 |
| 26712 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 70 | 89787 | 9 | 0 | 0 | 20 |
| 41840 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 80 | 164909 | 14 | 99999 | 0 | 54 |
| 29718 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 33 | 170979 | 15 | 0 | 1887 | 40 |
| 3565 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 34 | 157165 | 10 | 0 | 0 | 40 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27657 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 48 | 61985 | 5 | 0 | 0 | 20 |
| 14506 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 57 | 264148 | 9 | 0 | 0 | 45 |
| 30748 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 28 | 119287 | 13 | 15024 | 0 | 28 |
| 47372 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 22 | 415755 | 4 | 0 | 0 | 40 |
| 11593 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 42 | 55363 | 14 | 0 | 0 | 55 |
34153 rows × 97 columns
# Encode the ModeImpute test features.
cat_cols = ['workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'gender', 'country']
X_test_ModeImpute_num = X_test_ModeImpute.drop(cat_cols, axis=1)
X_test_ModeImpute_encoded = pd.get_dummies(X_test_ModeImpute[cat_cols])
X_test_ModeImpute_encoded = pd.concat([X_test_ModeImpute_encoded, X_test_ModeImpute_num], axis=1)
# BUG FIX: running get_dummies on train and test separately produces
# different dummy columns whenever a category is absent from one split
# (e.g. 'workclass_ Never-worked' appears in the training matrix but not the
# test matrix). Align the test matrix to the training columns so a model fit
# on the training set can score the test set; categories unseen in the test
# split become all-zero columns.
X_test_ModeImpute_encoded = X_test_ModeImpute_encoded.reindex(
    columns=X_train_ModeImpute_encoded.columns, fill_value=0)
X_test_ModeImpute_encoded
| workclass_ Federal-gov | workclass_ Local-gov | workclass_ Private | workclass_ Self-emp-inc | workclass_ Self-emp-not-inc | workclass_ State-gov | workclass_ Without-pay | education_ Bachelors | education_ College | education_ Doctorate | ... | country_ Trinadad&Tobago | country_ United-States | country_ Vietnam | country_ Yugoslavia | age | fnlwgt | education-num | capital-gain | capital-loss | hours-per-week | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4997 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 47 | 212120 | 9 | 0 | 0 | 40 |
| 31294 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 38 | 461337 | 9 | 0 | 0 | 33 |
| 22220 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 90 | 52386 | 13 | 0 | 0 | 40 |
| 21532 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 25 | 98756 | 10 | 0 | 0 | 50 |
| 26576 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 42 | 247695 | 9 | 0 | 0 | 45 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 30046 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 29 | 211482 | 13 | 0 | 0 | 40 |
| 911 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 38 | 278924 | 10 | 0 | 0 | 44 |
| 670 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 48 | 110457 | 10 | 0 | 0 | 40 |
| 19873 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 33 | 520078 | 12 | 0 | 0 | 60 |
| 48665 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 43 | 107306 | 10 | 0 | 0 | 55 |
13509 rows × 97 columns
# Train-test split for df_dropNA dataset
X = df_dropNA_encoded.drop('income', axis=1) # all independent variables except the target 'income'
y = df_dropNA_encoded['income'] # Target variable
X_train_dropNA_encoded, X_test_dropNA_encoded, y_train_dropNA_encoded, y_test_dropNA_encoded = train_test_split(X, y, test_size=0.3, random_state=40 )
print("The number of records in the training set is", X_train_dropNA_encoded.shape[0])
print("The number of records in the test set is", X_test_dropNA_encoded.shape[0])
The number of records in the training set is 31622 The number of records in the test set is 13553
# Train-test split for df_keepNA dataset
X = df_keepNA_encoded.drop('income', axis=1) # all independent variables except the target 'income'
y = df_keepNA_encoded['income'] # Target variable
X_train_keepNA_encoded, X_test_keepNA_encoded, y_train_keepNA_encoded, y_test_keepNA_encoded = train_test_split(X, y, test_size=0.3, random_state=40 )
print("The number of records in the training set is", X_train_keepNA_encoded.shape[0])
print("The number of records in the test set is", X_test_keepNA_encoded.shape[0])
The number of records in the training set is 34153 The number of records in the test set is 14637
Before building the model, we need to decide on the performance metric we would like to optimize and compare different models based on it. We will focus on recall, which shows what % of actual positives were picked up by the model. Precision tells us what % of predicted positives are actually correct, which is crucial in health care or other sectors where peoples' lives depend on the prediction (e.g. deciding how safe a car will be).
For this research, we are selecting 'recall' as our main performance metric as precision is not going to change anyone's life or we are not making any major life-altering decision based on precision in this research. We just need to determine as much actual target income as we can.
We will first build the model on the imbalanced model directly, which will be considered as our baseline. Once we have the baseline, we will then treat the data imbalance and compare our results against the baseline model.
# Tally the class frequencies in each training-label variant so the degree
# of imbalance can be compared across the three preprocessing strategies.
print(f"Majority and minority class y_train_dropNA_encoded: {sorted(Counter(y_train_dropNA_encoded).items())}")
print(f"Majority and minority class y_train_keepNA_encoded: {sorted(Counter(y_train_keepNA_encoded).items())}")
print(f"Majority and minority class y_train_ModeImpute: {sorted(Counter(y_train_ModeImpute).items())}")
Majority and minority class y_train_dropNA_encoded: [(' <=50K', 23787), (' >50K', 7835)]
Majority and minority class y_train_keepNA_encoded: [(' <=50K', 25995), (' >50K', 8158)]
Majority and minority class y_train_ModeImpute: [(' <=50K', 25995), (' >50K', 8158)]
# Baseline: train a Random Forest on the (imbalanced) drop-NA training
# set and evaluate on its held-out test split.
rf = RandomForestClassifier()
rf.fit(X_train_dropNA_encoded, y_train_dropNA_encoded)
base_model1 = rf  # fit() returns the estimator itself, so rf is the fitted model
base_prediction1 = base_model1.predict(X_test_dropNA_encoded)
# Per-class precision/recall/F1 against the held-out labels
print(classification_report(y_test_dropNA_encoded, base_prediction1))
precision recall f1-score support
<=50K 0.88 0.93 0.90 10186
>50K 0.73 0.63 0.68 3367
accuracy 0.85 13553
macro avg 0.81 0.78 0.79 13553
weighted avg 0.85 0.85 0.85 13553
Our base model for the drop_NA dataset shows a 63% recall for the minority class, whereas the majority class has 93% recall. This means only 63% of people who actually have a target income of >50K have been correctly predicted. This is likely happening due to class imbalance, which we will now fix.
We are going to explore the following 4 approaches to treat the class imbalance of the dataset:
We are going to apply our class imbalance treatment techniques to training dataset only. We cannot do it on test set otherwise it will leak information to the test which will result in incorrect or biased outcome. In Random Over-Sampling, we generate new samples for the minority class by sampling with replacement. We will use the RandomOverSampler from imblearn library for this random oversampling.
y_train_dropNA_encoded.value_counts()
<=50K 23787 >50K 7835 Name: income, dtype: int64
# Balance the training set by duplicating minority-class rows
# (sampling with replacement) until both classes are equal in size.
ros = RandomOverSampler(random_state=42)
X_train_ros1, y_train_ros1 = ros.fit_resample(
    X_train_dropNA_encoded, y_train_dropNA_encoded
)
# Confirm both classes now have the same count
counts_after_ros = sorted(Counter(y_train_ros1).items())
print(counts_after_ros)
[(' <=50K', 23787), (' >50K', 23787)]
After Random Oversampling, the minority class increased from 7835 to 23787, which is the same as majority category.
# Train a Random Forest on the randomly oversampled training data and
# evaluate it on the untouched (imbalanced) test split.
rf = RandomForestClassifier()
ros_model1 = rf.fit(X_train_ros1, y_train_ros1)
# BUG FIX: this previously called base_model1.predict(...), so the report
# below simply reproduced the baseline results instead of evaluating the
# model trained on the oversampled data. Predict with ros_model1.
ros_prediction1 = ros_model1.predict(X_test_dropNA_encoded)
# Check the model performance
print(classification_report(y_test_dropNA_encoded, ros_prediction1))
precision recall f1-score support
<=50K 0.88 0.93 0.90 10186
>50K 0.73 0.63 0.68 3367
accuracy 0.85 13553
macro avg 0.81 0.78 0.79 13553
weighted avg 0.85 0.85 0.85 13553
The Random Oversampling report above is identical to the base model's. NOTE (review): this is because the prediction step reused base_model1 rather than the model trained on the oversampled data (ros_model1), so this output reflects the baseline model; the comparison should be re-run with predictions from ros_model1.
SMOTE (Synthetic Minority Oversampling Technique) was published in 2002. Instead of randomly oversampling with replacement, SMOTE takes each minority sample and introduces synthetic data points connecting the minority sample and its nearest neighbours. Neighbours from the k nearest neighbours are chosen randomly.
y_train_dropNA_encoded.value_counts()
<=50K 23787 >50K 7835 Name: income, dtype: int64
# Oversample the minority class with SMOTE: synthetic points are
# interpolated between each minority sample and its nearest neighbours.
smote1 = SMOTE(random_state=40)
X_train_smote1, y_train_smote1 = smote1.fit_resample(
    X_train_dropNA_encoded, y_train_dropNA_encoded
)
# Verify the classes are now balanced
counts_after_smote = sorted(Counter(y_train_smote1).items())
print(counts_after_smote)
[(' <=50K', 23787), (' >50K', 23787)]
Similar to random oversampling, the minority class increased from 7835 to 23787, which is the same as majority category.
# Fit a Random Forest on the SMOTE-balanced training data and score it
# on the original (imbalanced) test split.
rf = RandomForestClassifier()
rf.fit(X_train_smote1, y_train_smote1)
smote_model1 = rf  # fit() returns the estimator, so rf is the fitted model
smote_prediction1 = smote_model1.predict(X_test_dropNA_encoded)
# Per-class precision/recall/F1 against the held-out labels
print(classification_report(y_test_dropNA_encoded, smote_prediction1))
precision recall f1-score support
<=50K 0.89 0.91 0.90 10186
>50K 0.71 0.65 0.68 3367
accuracy 0.85 13553
macro avg 0.80 0.78 0.79 13553
weighted avg 0.84 0.85 0.84 13553
We can see a very minimal improvement in recall of the minority class compared to the base and random oversampling results: recall has increased to 65% from the 63% baseline, whereas precision has decreased. Accuracy remains the same.
Random undersampling randomly discards data points from the majority class. After the sampling, the majority class has the same number of data points as the minority class.
y_train_dropNA_encoded.value_counts()
<=50K 23787 >50K 7835 Name: income, dtype: int64
# Balance the classes by randomly discarding majority-class rows until
# the majority matches the minority count.
rus1 = RandomUnderSampler(random_state=40)
X_train_rus1, y_train_rus1 = rus1.fit_resample(
    X_train_dropNA_encoded, y_train_dropNA_encoded
)
# Both classes should now equal the original minority count
counts_after_rus = sorted(Counter(y_train_rus1).items())
print(counts_after_rus)
[(' <=50K', 7835), (' >50K', 7835)]
We can see that, after random undersampling, the majority class has been decreased to 7835, which is the same as the minority class.
# Train a Random Forest on the randomly under-sampled data; evaluation
# still uses the full, untouched test split.
rf = RandomForestClassifier()
rf.fit(X_train_rus1, y_train_rus1)
rus_model1 = rf  # fit() returns the estimator, so rf is the fitted model
rus_prediction1 = rus_model1.predict(X_test_dropNA_encoded)
# Report precision/recall/F1 per class on the test set
print(classification_report(y_test_dropNA_encoded, rus_prediction1))
precision recall f1-score support
<=50K 0.94 0.80 0.86 10186
>50K 0.58 0.83 0.68 3367
accuracy 0.81 13553
macro avg 0.76 0.82 0.77 13553
weighted avg 0.85 0.81 0.82 13553
We can see that recall for the minority class has increased significantly, from 63% in the base model to 83% with random under-sampling, while overall accuracy remains fairly high at 81%.
NearMiss from imblearn library uses the KNN(K Nearest Neighbours) to do the undersampling. Based on the documentation of imblearn library, there are 3 versions of NearMiss algorithms:
We are using version 3, where for each data point in the minority class, its K nearest neighbours in the majority class are sampled. Then, for each of those sampled majority data points, we calculate the average distance to its N nearest neighbours, and the data points with the largest average distance are kept as the sample.
y_train_dropNA_encoded.value_counts()
<=50K 23787 >50K 7835 Name: income, dtype: int64
# Under-sample the majority class with NearMiss version 3 (KNN-based
# selection of majority points, per the imblearn documentation).
nearmiss1 = NearMiss(version=3)
X_train_nearmiss1, y_train_nearmiss1 = nearmiss1.fit_resample(
    X_train_dropNA_encoded, y_train_dropNA_encoded
)
# Confirm the majority class was reduced to the minority count
counts_after_nearmiss = sorted(Counter(y_train_nearmiss1).items())
print(counts_after_nearmiss)
[(' <=50K', 7835), (' >50K', 7835)]
We can see that, similar to random undersampling, the majority class has been decreased to 7835, which is the same as the minority class.
# Fit a Random Forest on the NearMiss-under-sampled data and evaluate
# on the original (imbalanced) test split.
rf = RandomForestClassifier()
rf.fit(X_train_nearmiss1, y_train_nearmiss1)
nearmiss_model1 = rf  # fit() returns the estimator, so rf is the fitted model
nearmiss_prediction1 = nearmiss_model1.predict(X_test_dropNA_encoded)
# Per-class precision/recall/F1 against the held-out labels
print(classification_report(y_test_dropNA_encoded, nearmiss_prediction1))
precision recall f1-score support
<=50K 0.93 0.81 0.87 10186
>50K 0.59 0.82 0.69 3367
accuracy 0.81 13553
macro avg 0.76 0.82 0.78 13553
weighted avg 0.85 0.81 0.82 13553
We can see that the recall is slightly lower 82% than the recall in random undersampling.